from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import csv
# Collect the text found in the fourth "|"-separated column of every row of the
# theme export; these strings are the corpus used by the rest of the script.
# NOTE(review): the file is decoded with the platform default encoding —
# confirm the export's real encoding and pass `encoding=` explicitly.
documents = []
# newline='' lets the csv module handle line endings (and newlines embedded
# in quoted fields) itself, as the csv documentation requires.
with open('output/themes/Boirefumersedroguer2.csv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter="|")
    for r in reader:
        # Blank or truncated rows have no fourth column; skip them instead
        # of aborting the whole load with an IndexError.
        if len(r) > 3:
            documents.append(r[3])
# TF-IDF model restricted to terms that occur in at least 4 documents and to
# at most 5000 features.
vectorizer = TfidfVectorizer(min_df=4, max_features=5000)
# fit_transform learns the vocabulary and IDF weights and returns the
# document-term matrix for `documents`.
vz = vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new accessor and fall back to the old
# one so the script runs on both old and new releases.
if hasattr(vectorizer, "get_feature_names_out"):
    _tfidf_terms = vectorizer.get_feature_names_out()
else:
    _tfidf_terms = vectorizer.get_feature_names()
# Map each vocabulary term to its IDF weight.
tfidf = dict(zip(_tfidf_terms, vectorizer.idf_))
# Example lookup: print("dublin: " + str(tfidf["violer"]))
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

# Stage 1: reduce the first 5000 rows of the TF-IDF matrix to 50 components
# with truncated SVD (seeded for repeatable output).
svd = TruncatedSVD(random_state=0, n_components=50)
svd_tfidf = svd.fit_transform(vz[:5000])
# Bare expression kept from the notebook; it has no effect when run as a script.
svd_tfidf.shape

# Stage 2: embed the 50-component vectors into 2 dimensions with t-SNE.
# `tsne_model` is reused later in the script for the k-means view.
tsne_model = TSNE(random_state=0, verbose=1, n_components=2)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)
# Notebook inspection leftovers (shape and first embedded point).
tsne_tfidf.shape
tsne_tfidf[0]
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

# Send Bokeh output to the notebook.
output_notebook()

# Scatter plot of the t-SNE embedding of the TF-IDF vectors; axes are hidden.
plot_tfidf = bp.figure(plot_width=900, plot_height=700, title="Ciao violences questions (tf-idf)",
                       tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                       x_axis_type=None, y_axis_type=None, min_border=1)

# Keep the coordinates and the hover text together in one data source and
# reference them by column name: Bokeh expects glyph arguments to name
# columns when an explicit `source` is supplied, rather than mixing literal
# arrays with a source. All columns cover the same first 5000 documents.
tfidf_source = bp.ColumnDataSource({
    "x": tsne_tfidf[:, 0],
    "y": tsne_tfidf[:, 1],
    "question": documents[:5000],
})
plot_tfidf.scatter(x="x", y="y", source=tfidf_source)

# Show the original text when hovering over a point.
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips = {"Question": "@question"}
show(plot_tfidf)
from sklearn.cluster import MiniBatchKMeans

# Cluster the full TF-IDF matrix into `num_clusters` groups.
# NOTE(review): unlike the SVD and t-SNE models, no random_state is set here,
# so cluster assignments can differ between runs.
num_clusters = 6
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                               init_size=1000, batch_size=100, verbose=False, max_iter=5000)
kmeans = kmeans_model.fit(vz)
# Cluster label per document, and each document's distance to every centroid.
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)

# For every centroid, feature indices ordered from largest to smallest weight.
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the script runs on either.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Print the ten heaviest terms of each cluster, one blank line between clusters.
for i in range(num_clusters):
    print("Cluster %d:" % i)
    for j in sorted_centroids[i, :10]:
        print(' %s' % terms[j])
    print()
import numpy as np

# 2-D t-SNE embedding of the centroid-distance vectors of the first 5000
# documents (reuses the t-SNE model configured earlier in the script).
tsne_kmeans = tsne_model.fit_transform(kmeans_distances[:5000])

# Palette indexed by cluster label.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5",
])

# NOTE(review): the title mentions "Web Summit 2015 tweets" although the data
# is read from a themes CSV — looks like a copy-paste leftover; confirm.
plot_kmeans = bp.figure(plot_width=900, plot_height=700, title="Web Summit 2015 tweets (k-means)",
                        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                        x_axis_type=None, y_axis_type=None, min_border=1)

# Every column is limited to the same first 5000 documents. The colours were
# previously sliced to 10000 entries, which no longer matches the 5000
# plotted points once the corpus holds more than 5000 documents. Coordinates
# and colours live in the data source and are referenced by column name, as
# Bokeh expects when an explicit `source` is supplied.
kmeans_source = bp.ColumnDataSource({
    "x": tsne_kmeans[:, 0],
    "y": tsne_kmeans[:, 1],
    "color": colormap[kmeans_clusters[:5000]].tolist(),
    "question": documents[:5000],
    "cluster": kmeans_clusters[:5000],
})
plot_kmeans.scatter(x="x", y="y", color="color", source=kmeans_source)

# Show the original text when hovering over a point.
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips = {"Question": "@question"}
show(plot_kmeans)